By Abebaw Masresha July 2024
AllLife Bank is a US bank that has a growing customer base. The majority of these customers are liability customers (depositors) with varying sizes of deposits. The number of customers who are also borrowers (asset customers) is quite small, and the bank is interested in expanding this base rapidly to bring in more loan business and in the process, earn more through the interest on loans. In particular, the management wants to explore ways of converting its liability customers to personal loan customers (while retaining them as depositors).
A campaign that the bank ran last year for liability customers showed a healthy conversion rate of over 9% success. This has encouraged the retail marketing department to devise campaigns with better target marketing to increase the success ratio.
You as a Data scientist at AllLife bank have to build a model that will help the marketing department to identify the potential customers who have a higher probability of purchasing the loan.
To predict whether a liability customer will buy personal loans, to understand which customer attributes are most significant in driving purchases, and identify which segment of customers to target more.
Data Dictionary:
- ID: Customer ID
- Age: Customer's age in completed years
- Experience: Number of years of professional experience
- Income: Annual income of the customer (in thousand dollars)
- ZIP Code: Home address ZIP code
- Family: Family size of the customer
- CCAvg: Average spending on credit cards per month (in thousand dollars)
- Education: Education level. 1: Undergrad; 2: Graduate; 3: Advanced/Professional
- Mortgage: Value of house mortgage, if any (in thousand dollars)
- Personal_Loan: Did this customer accept the personal loan offered in the last campaign? (0: No, 1: Yes)
- Securities_Account: Does the customer have a securities account with the bank? (0: No, 1: Yes)
- CD_Account: Does the customer have a certificate of deposit (CD) account with the bank? (0: No, 1: Yes)
- Online: Does the customer use internet banking facilities? (0: No, 1: Yes)
- CreditCard: Does the customer use a credit card issued by any other bank (excluding AllLife Bank)? (0: No, 1: Yes)
# Install SHAP library for model interpretability
!pip install shap
# Install Optuna for hyperparameter tuning
!pip install optuna
# Libraries to help with reading and manipulating data
import pandas as pd
import numpy as np
from scipy import stats
# Libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Set display options for pandas
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 200)
pd.set_option("display.float_format", lambda x: "%.5f" % x)
# Libraries for data preprocessing
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
# Library to split data
from sklearn.model_selection import train_test_split
# Libraries to build models for prediction
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
# Import Optuna library for hyperparameter optimization
import optuna
# Import the Tree-structured Parzen Estimator (TPE) sampler for efficient hyperparameter search
from optuna.samplers import TPESampler
# Libraries to tune different models
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score
# Libraries to get different metric scores
from sklearn.metrics import (
f1_score,
accuracy_score,
recall_score,
precision_score,
confusion_matrix,
ConfusionMatrixDisplay,
make_scorer
)
# Library for model interpretation
import shap
# To ignore unnecessary warnings
import warnings
warnings.filterwarnings("ignore")
# Check if all necessary libraries are imported
print("All necessary libraries have been imported successfully.")
Note: After running the above cell, kindly restart the notebook kernel and run all cells sequentially from the start again.
# Read the data from Google Drive (Colab-only; requires an interactive auth prompt)
from google.colab import drive
drive.mount("/content/drive", force_remount=True)
# Path to the dataset on the mounted drive
path="/content/drive/My Drive/AIMLTRAINING/collabdata/Loan_Modelling.csv"
# Read the data as CSV into a dataframe
data=pd.read_csv(path)
# Drop ID: it is a row identifier with no predictive meaning
df=data.drop('ID',axis=1)
# Independent copy kept for later steps that must not see in-place changes to df
df2=df.copy()
# sanity checks
def data_sanity(df):
    """
    Display structural info, descriptive statistics, null counts, dtypes and
    the duplicate-row count for a dataframe, separated by rule lines.

    Note: relies on the notebook-provided ``display`` helper.
    """
    # df.info() prints directly and returns None, so it runs outside the loop.
    print("\nData Information:\n")
    display(df.info())
    for heading, result in (
        ("Descriptive Statistics:\n", df.describe()),
        ("Null Values Check:\n", df.isnull().sum()),
        ("Data Types:\n", df.dtypes),
        ("Duplicate Records Check:\n", df.duplicated().sum()),
    ):
        print("\n========================\n")
        print(heading)
        display(result)
# Run the sanity checks on the working dataframe
data_sanity(df)
# Keep a copy of the original data before any cleaning steps
df_original=df.copy()
Data Overview
Data Quality Checks
Questions:
Univariate Analysis
def histogram_boxplot(data, feature, figsize=(15, 10), kde=False, bins=None):
    """
    Draw a boxplot and a histogram of ``feature`` stacked on a shared x-axis.

    data: dataframe
    feature: dataframe column name
    figsize: size of figure (default (15, 10))
    kde: whether to overlay the density curve (default False)
    bins: number of histogram bins (default None -> seaborn's automatic binning)
    """
    fig, (ax_box, ax_hist) = plt.subplots(
        nrows=2,
        sharex=True,  # boxplot and histogram share the same x-axis scale
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )
    # Boxplot on top; the triangle marker indicates the mean of the column.
    sns.boxplot(data=data, x=feature, ax=ax_box, showmeans=True, color="violet")
    # The original used a conditional *expression* purely for its side effects;
    # a plain if/else states the intent directly.
    if bins:
        sns.histplot(data=data, x=feature, kde=kde, ax=ax_hist, bins=bins)
    else:
        sns.histplot(data=data, x=feature, kde=kde, ax=ax_hist)
    # Mark mean (dashed green) and median (solid black) on the histogram.
    ax_hist.axvline(data[feature].mean(), color="green", linestyle="--")
    ax_hist.axvline(data[feature].median(), color="black", linestyle="-")
def labeled_barplot(data, feature, perc=False, n=None):
    """
    Count plot of ``feature`` with "count (percentage)" written above each bar.

    data: dataframe
    feature: dataframe column name
    perc: kept for interface compatibility (labels always show percentages)
    n: show only the top n category levels (default None -> all levels)
    """
    total = len(data[feature])
    levels = data[feature].nunique()
    # Widen the figure with the number of bars actually drawn.
    fig_width = (levels if n is None else n) + 2
    plt.figure(figsize=(fig_width, 6))
    plt.xticks(rotation=90, fontsize=15)
    ax = sns.countplot(
        data=data,
        x=feature,
        palette="Paired",
        order=data[feature].value_counts().index[:n],
    )
    # Annotate every bar with its count and share of the total.
    for patch in ax.patches:
        height = patch.get_height()
        center = patch.get_x() + patch.get_width() / 2
        ax.annotate(
            "{} ({:.1f}%)".format(int(height), 100 * height / total),
            (center, height),
            ha="center",
            va="center",
            size=12,
            xytext=(0, 5),
            textcoords="offset points",
        )
    plt.show()
def distribution_plot_wrt_target(data, predictor, target):
    """
    2x2 panel: density histograms of ``predictor`` for the first two target
    classes (top row), and boxplots of ``predictor`` by ``target`` with and
    without outliers (bottom row).

    data: dataframe
    predictor: independent variable
    target: target variable (at least two distinct values expected)
    """
    fig, axs = plt.subplots(2, 2, figsize=(12, 10))
    target_uniq = data[target].unique()
    # Top row: one density histogram per class, in its own colour.
    for col, (cls, shade) in enumerate(
        ((target_uniq[0], "teal"), (target_uniq[1], "orange"))
    ):
        axs[0, col].set_title("Distribution of target for target=" + str(cls))
        sns.histplot(
            data=data[data[target] == cls],
            x=predictor,
            kde=True,
            ax=axs[0, col],
            color=shade,
            stat="density",
        )
    # Bottom row: boxplots with and without fliers.
    axs[1, 0].set_title("Boxplot w.r.t target")
    sns.boxplot(data=data, x=target, y=predictor, ax=axs[1, 0], palette="gist_rainbow")
    axs[1, 1].set_title("Boxplot (without outliers) w.r.t target")
    sns.boxplot(
        data=data,
        x=target,
        y=predictor,
        ax=axs[1, 1],
        showfliers=False,
        palette="gist_rainbow",
    )
    plt.tight_layout()
    plt.show()
def univariate_analysis(df, numerical_features, categorical_features):
    """
    Univariate EDA driver: histogram+boxplot and a 1.5*IQR outlier check for
    each numeric column, then a labeled bar plot for each categorical column.
    """
    for feature in numerical_features:
        histogram_boxplot(df, feature)
        plt.show()
        # 1.5 * IQR whisker rule for outlier detection.
        q1 = df[feature].quantile(0.25)
        q3 = df[feature].quantile(0.75)
        iqr = q3 - q1
        lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
        has_outliers = ((df[feature] < lower) | (df[feature] > upper)).any()
        if has_outliers:
            print(f" - Outliers detected in {feature}:")
            print("\n")
        else:
            print(f" - No significant outliers detected in {feature}.\n")
        print("\n" + "-" * 100 + "\n")  # separator between variables
    for feature in categorical_features:
        print("\n")
        labeled_barplot(df, feature)
        plt.show()
        print("\n" + "=" * 50 + "\n")  # separator between variables
def summarize_zipcode_distribution(df, zipcode_column='ZIPCode', top_n=20):
    """
    Print a summary of the distribution of the ZIP code attribute.

    Parameters:
        df (pd.DataFrame): The dataset containing the ZIP code column.
        zipcode_column (str): The column name for the ZIP code.
        top_n (int): Number of top ZIP codes to display.

    Returns:
        None
    """
    # Work on a cast copy of the column; the original assigned the
    # 'category' cast back into ``df``, silently mutating the caller's frame.
    codes = df[zipcode_column].astype('category')
    # Frequency table, most common ZIP codes first.
    summary = codes.value_counts().reset_index()
    summary.columns = [zipcode_column, 'Count']
    print(f"Summary of {zipcode_column}:")
    print(f"Total unique ZIP codes: {len(summary)}")
    print("\nTop ZIP codes by count:")
    print(summary.head(top_n))  # Display top N ZIP codes for brevity
# Column groups driving the univariate analysis below
numerical_features = ['Age', 'Experience', 'Income', 'Family', 'CCAvg', 'Mortgage']
categorical_features = ['Education', 'Personal_Loan', 'Securities_Account', 'CD_Account', 'Online', 'CreditCard']
# Histogram/boxplot + outlier check per numeric column, labeled bar plot per categorical one
univariate_analysis(df, numerical_features, categorical_features)
# ZIP code is high-cardinality, so it gets a separate frequency summary
summarize_zipcode_distribution(df)
Age
Experience
Income
Family
CCAvg (Average spending on credit cards per month)
Mortgage
ZIPCode
Education
Personal_Loan
Securities_Account
CD_Account
Online
CreditCard
#Bivariate Analysis for Categorical vs. Categorical
def bivariate_cat_cat(df, var1, var2):
    """Grouped bar chart of the cross-tabulation of two categorical columns."""
    crosstab = pd.crosstab(df[var1], df[var2])
    crosstab.plot(kind='bar', figsize=(12, 8), cmap='viridis')
    # Write each bar's count just above it.
    for bar in plt.gca().patches:
        plt.text(
            bar.get_x() + bar.get_width() / 2.,
            bar.get_height() + 10,
            int(bar.get_height()),
            ha='center',
            va='bottom',
        )
    plt.ylabel('Count')
    plt.title(f'Bivariate Analysis: {var1} vs {var2}')
    plt.xticks(rotation=45)
    plt.show()
# Import the f_oneway function from scipy.stats for performing one-way ANOVA
from scipy.stats import f_oneway
# Bivariate analysis for categorical vs continuous variables
def bivariate_cat_cont(df, cat_var, cont_var):
    """
    Plot the relationship between a categorical and a continuous variable,
    printing per-group descriptive statistics and a one-way ANOVA result.

    Parameters:
        df (pd.DataFrame): The DataFrame containing the data.
        cat_var (str): The name of the categorical variable.
        cont_var (str): The name of the continuous variable.
    """
    # Guard against typos in the column names.
    if cat_var not in df.columns or cont_var not in df.columns:
        print(f"Error: One or both variables '{cat_var}' and '{cont_var}' are not in the DataFrame.")
        return
    # Per-category descriptive statistics. The original computed this and
    # silently discarded it, despite the docstring promising a data summary.
    grouped_data = df.groupby(cat_var)[cont_var].describe()
    print(grouped_data)
    # One-way ANOVA: are the group means significantly different?
    groups = [df[df[cat_var] == category][cont_var] for category in df[cat_var].unique()]
    anova_result = f_oneway(*groups)
    print(f"ANOVA test result: F-statistic = {anova_result.statistic}, p-value = {anova_result.pvalue}")
    print("\n")
    # Boxplot of the continuous variable split by category.
    plt.figure(figsize=(14, 7))
    sns.boxplot(x=cat_var, y=cont_var, data=df, palette="Set2")
    plt.title(f'Boxplot of {cont_var} by {cat_var}', fontsize=16)
    plt.xlabel(cat_var, fontsize=14)
    plt.ylabel(cont_var, fontsize=14)
    plt.show()
#Bivariate Analysis for Continuous vs. Continuous
def bivariate_cont_cont(df, var1, var2):
    """
    Scatter plot of two continuous variables.

    The original version computed ``describe()`` and ``corr()`` into unused
    locals on every call (their prints were commented out); the dead work is
    removed here without changing any output.
    """
    plt.figure(figsize=(12, 8))
    sns.scatterplot(x=var1, y=var2, data=df)
    plt.title(f'Bivariate Analysis: {var1} vs {var2}')
    plt.xlabel(var1)
    plt.ylabel(var2)
    plt.show()
def correlation_heatmap(df):
    """
    Draw an annotated heatmap of the pairwise correlation matrix.

    Parameters:
        df (DataFrame): The DataFrame containing the data.

    Returns:
        None
    """
    corr = df.corr()
    plt.figure(figsize=(15, 9))
    sns.heatmap(corr, annot=True, cmap='coolwarm', linewidths=0.5)
    plt.title('Correlation Heatmap')
    plt.show()
# Function to perform ZIP code analysis
def zipcode_analysis(df):
    """
    Aggregate customer attributes by ZIP code and report the top 5 ZIP codes
    for several metrics (bar chart plus printed table per metric), then count
    the ZIP codes with no accepted personal loans.

    Parameters:
        df (pd.DataFrame): Customer data containing a 'ZIPCode' column.

    Returns:
        None
    """
    # Group by ZIPCode and calculate meaningful statistics
    zip_analysis = df.groupby('ZIPCode').agg({
        'Age': ['mean', 'count'],  # the 'count' doubles as customers-per-ZIP
        'Experience': 'mean',
        'Income': 'mean',
        'Family': 'mean',
        'CCAvg': 'mean',
        'Education': lambda x: x.value_counts().idxmax(),  # Most common education level
        'Mortgage': 'mean',
        'Securities_Account': 'sum',
        'CD_Account': 'sum',
        'Online': 'sum',
        'CreditCard': 'sum'
    })
    # Flatten the (column, aggfunc) MultiIndex into names like 'Income_mean'
    zip_analysis.columns = ['_'.join(col).strip() for col in zip_analysis.columns.values]
    zip_analysis = zip_analysis.rename(columns={'Age_count': 'Customer_Count'})
    # Convert the binary-column sums into per-ZIP ownership/usage rates
    binary_vars = ['Securities_Account', 'CD_Account', 'Online', 'CreditCard']
    for var in binary_vars:
        zip_analysis[f'{var}_Rate'] = zip_analysis[f'{var}_sum'] / zip_analysis['Customer_Count']
    # Helper: bar chart of the 5 ZIP codes with the largest value of `metric`
    def plot_top_5(df, metric, title, xlabel, ylabel):
        top_5 = df.nlargest(5, metric)
        plt.figure(figsize=(10, 6))
        top_5[metric].plot(kind='bar', color='skyblue')
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.xticks(rotation=45)
        plt.show()
    # Plot top 5 ZIP codes for each metric
    metrics = {
        'Income_mean': 'Top 5 ZIP Codes by Average Income',
        'CCAvg_mean': 'Top 5 ZIP Codes by Average Credit Card Spending',
        'Mortgage_mean': 'Top 5 ZIP Codes by Average Mortgage',
        'Securities_Account_Rate': 'Top 5 ZIP Codes by Securities Account Ownership Rate',
        'CD_Account_Rate': 'Top 5 ZIP Codes by CD Account Ownership Rate',
        'Online_Rate': 'Top 5 ZIP Codes by Online Banking Usage Rate',
        'CreditCard_Rate': 'Top 5 ZIP Codes by Credit Card Ownership Rate'
    }
    for metric, title in metrics.items():
        plot_top_5(zip_analysis, metric, title, 'ZIP Code', metric.replace('_', ' ').title())
    # Print the same top-5 tables for each metric
    for metric in metrics.keys():
        print(f"Top 5 ZIP Codes by {metric.replace('_', ' ').title()}:")
        print(zip_analysis.nlargest(5, metric)[[metric]])
        print("\n")
    # Calculate the number of ZIP codes with zero personal loans
    zero_personal_loans = (df.groupby('ZIPCode')['Personal_Loan'].sum() == 0).sum()
    print(f"Number of ZIP codes with zero personal loans: {zero_personal_loans}")
# Function to get top 10 ZIP codes that take personal loans and count ZIP codes with zero personal loans
def top_10_zipcodes_personal_loans(df):
    """
    Print and plot the 10 ZIP codes with the most accepted personal loans,
    then report how many ZIP codes had none at all.
    """
    # Total accepted loans per ZIP code.
    per_zip = df.groupby('ZIPCode').agg({'Personal_Loan': 'sum'}).reset_index()
    top_10 = per_zip.nlargest(10, 'Personal_Loan')
    print("Top 10 ZIP Codes by Personal Loan Acceptance:")
    print(top_10)
    # Bar chart of the top 10.
    plt.figure(figsize=(10, 6))
    top_10.set_index('ZIPCode')['Personal_Loan'].plot(kind='bar', color='green')
    plt.title('Top 10 ZIP Codes by Personal Loan Acceptance')
    plt.xlabel('ZIP Code')
    plt.ylabel('Personal Loan Acceptance')
    plt.xticks(rotation=45)
    plt.show()
    # ZIP codes where no customer accepted a loan.
    zero_personal_loans = (per_zip['Personal_Loan'] == 0).sum()
    print(f"Number of ZIP codes with zero personal loans: {zero_personal_loans}")
Categorical vs Categorical Variables
# All 21 unique pairs of the seven categorical variables, each analysed
# with a crosstab grouped bar chart
cat_vs_cat_vars = [
    ('Family', 'Education'),
    ('Family', 'Personal_Loan'),
    ('Education', 'Personal_Loan'),
    ('Family', 'Securities_Account'),
    ('Family', 'CD_Account'),
    ('Family', 'Online'),
    ('Family', 'CreditCard'),
    ('Education', 'Securities_Account'),
    ('Education', 'CD_Account'),
    ('Education', 'Online'),
    ('Education', 'CreditCard'),
    ('Personal_Loan', 'Securities_Account'),
    ('Personal_Loan', 'CD_Account'),
    ('Personal_Loan', 'Online'),
    ('Personal_Loan', 'CreditCard'),
    ('Securities_Account', 'CD_Account'),
    ('Securities_Account', 'Online'),
    ('Securities_Account', 'CreditCard'),
    ('CD_Account', 'Online'),
    ('CD_Account', 'CreditCard'),
    ('Online', 'CreditCard')
]
for var1, var2 in cat_vs_cat_vars:
    bivariate_cat_cat(df, var1, var2)
Family vs. Education:
Family vs. Personal_Loan:
Family vs. Securities_Account:
Family vs. CD_Account:
Family vs. Online:
Family vs. CreditCard:
Education vs. Personal_Loan:
Education vs. Securities_Account:
Education vs. CD_Account:
Education vs. Online:
Education vs. CreditCard:
Personal_Loan vs. Securities_Account:
Personal_Loan vs. CD_Account:
Personal_Loan vs. Online:
Personal_Loan vs. CreditCard:
Securities_Account vs. CD_Account:
Securities_Account vs. Online:
Securities_Account vs. CreditCard:
CD_Account vs. Online:
CD_Account vs. CreditCard:
Online vs. CreditCard:
Categorical vs. Continuous and Continuous vs. Categorical Variables
# Categorical-vs-continuous pairs: each categorical variable against
# Age, Income and Experience (boxplot + ANOVA per pair)
cat_vs_cont_vars = [
    ('Family', 'Age'),
    ('Family', 'Income'),
    ('Family', 'Experience'),
    ('Education', 'Age'),
    ('Education', 'Income'),
    ('Education', 'Experience'),
    ('Personal_Loan', 'Age'),
    ('Personal_Loan', 'Income'),
    ('Personal_Loan', 'Experience'),
    ('Securities_Account', 'Age'),
    ('Securities_Account', 'Income'),
    ('Securities_Account', 'Experience'),
    ('CD_Account', 'Age'),
    ('CD_Account', 'Income'),
    ('CD_Account', 'Experience'),
    ('Online', 'Age'),
    ('Online', 'Income'),
    ('Online', 'Experience'),
    ('CreditCard', 'Age'),
    ('CreditCard', 'Income'),
    ('CreditCard', 'Experience')
]
# Call the method for each categorical vs. continuous pair
for cat_var, cont_var in cat_vs_cont_vars:
    bivariate_cat_cont(df, cat_var, cont_var)
Family vs Age
Family vs Income
Family vs Experience
Education vs Age
Education vs Income
Education vs Experience
Personal Loan vs Age
Personal Loan vs Income
Personal Loan and Experience
Securities Account vs Age
Securities Account vs Income
Securities Account vs Experience
CD Account vs Age
CD Account vs Income
CD Account and Experience
Summary
Continuous vs. Continuous Variables Bivariate Analysis
# List of continuous variable pairs for analysis
cont_vs_cont_vars = [
    ('Age', 'Income'),
    ('Age', 'Experience'),
    ('Income', 'Experience')
]
# Scatter plot for each pair of continuous variables
for var1, var2 in cont_vs_cont_vars:
    bivariate_cont_cont(df, var1, var2)
# Heatmap on df2, the separate copy of the data taken right after loading
correlation_heatmap(df2)
Correlation Data Observations:
# Per-ZIP aggregates with top-5 plots/tables for each metric
zipcode_analysis(df)
# Top ZIP codes by loan acceptance, plus how many ZIP codes had none
top_10_zipcodes_personal_loans(df)
ZIPCode data Observations:
Q1. What is the distribution of mortgage attribute? Are there any noticeable patterns or outliers in the distribution?
Q2. How many customers have credit cards?
Q3. What are the attributes that have a strong correlation with the target attribute (personal loan)?
Q4. How does a customer's interest in purchasing a loan vary with their age?
Q5. How does a customer's interest in purchasing a loan vary with their education?
Missing Value Treatment
Upon review of the dataset, it was observed that there are negative values in the "Experience" column, with a minimum value of -3 years. Given the lack of access to the data source for further investigation and verification, we must address these anomalies directly.
# Count records with an implausible (negative) professional experience
print(f"There are {len(df[df['Experience'] < 0])} rows with negative values in the 'Experience' column.")
Due to the absence of a mechanism for further investigation and the small proportion of affected records (52 out of 5000), the decision has been made to remove these records. This approach is chosen to prevent potential distortions in the analysis that could arise from including implausible values.
# Remove rows with negative experience values (filtering df2, the clean copy)
df_cleaned = df2[df2['Experience'] >= 0]
# Snapshot copies of the cleaned data for later reuse
df_original_cleaned=df_cleaned.copy()
df_original_cleaned2=df_original_cleaned.copy()
Specific Answers to points raised in this section
Feature Engineering:
We applied feature engineering techniques; however, the results indicate that the model's performance with these engineered features was suboptimal. For a detailed evaluation, please refer to the Approach 3 section in the Appendices section.
Outlier Detection and Treatment:
We implemented outlier detection and treatment strategies and subsequently built a model. The findings suggest that this model performed worse compared to one trained on untreated data. Please review the Approach 2 in the Appendices section.
Any Other Preprocessing Steps:
We further enhanced the model by exploring various algorithms and tuning methods to improve performance. For more details, please refer to the Models section.
# Define features and target variable
X = df_cleaned.drop(["Personal_Loan"], axis=1)
Y = df_cleaned["Personal_Loan"]
# Convert categorical variables to dummy/indicator variables
X = pd.get_dummies(X, drop_first=True)
# Splitting data into train and test sets (70/30, fixed seed for reproducibility)
# NOTE(review): the target is imbalanced; consider stratify=Y so both splits
# keep the same class proportions — confirm before changing, since the
# reported metrics elsewhere in this notebook depend on this exact split.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=1)
# Print shapes to verify
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print("Shape of Training set : ", X_train.shape)
print("Shape of test set : ", X_test.shape)
print("Percentage of classes in training set:")
print(y_train.value_counts(normalize=True))
print("Percentage of classes in test set:")
print(y_test.value_counts(normalize=True))
Training and test data split basic stat Observations:
Decision Tree
A Decision Tree is a good choice for this project because it provides clear, interpretable results, showing how different customer features impact the likelihood of purchasing a personal loan. It handles both categorical and continuous variables well and offers insights into feature importance, which is useful for targeting customer segments effectively.
# Initialize the baseline (untuned) Decision Tree Classifier with a fixed
# random state for reproducibility
decision_tree_baseline_model = DecisionTreeClassifier(random_state=1)
# Fit the model to the training data
# X_train: features of the training set; y_train: training labels
decision_tree_baseline_model.fit(X_train, y_train)
Model Evaluation Criterion for Personal Loan Campaign
In predicting whether a customer will buy a personal loan, model predictions can fall into two categories:
False Negatives (FN): Predicting a customer will not buy a loan when they actually will. This could lead to missed sales opportunities and reduced revenue.
False Positives (FP): Predicting a customer will buy a loan when they actually will not. This might lead to wasted marketing efforts and costs associated with targeting customers who are unlikely to convert.
Which Case is More Important?
False Negatives (FN): Missing a potential customer who is likely to buy a loan results in lost revenue and missed business opportunities. It’s crucial to minimize False Negatives to maximize potential sales and improve campaign effectiveness.
False Positives (FP): Predicting a customer will buy a loan when they won’t can lead to inefficient use of resources and budget, but the impact is generally less severe compared to the loss of potential sales.
How to Optimize the Model?
To reduce the impact of missed sales opportunities (False Negatives), it is important to focus on improving the model’s recall. A higher recall score indicates better performance in identifying potential customers who are likely to buy a loan, thus enhancing the effectiveness of the campaign and minimizing revenue loss.
# Defining a function to compute different metrics for evaluating a classification model built using sklearn
def model_performance_classification_sklearn(model, predictors, target):
    """
    Compute Accuracy, Recall, Precision and F1 for a fitted classifier.

    Parameters:
        model: trained scikit-learn classifier.
        predictors: DataFrame or array-like of independent variables.
        target: Series or array-like of true labels.

    Returns:
        DataFrame with a single row holding the four metric scores.
    """
    pred = model.predict(predictors)
    # Column order matches the rest of the notebook's reporting.
    scores = {
        "Accuracy": [accuracy_score(target, pred)],
        "Recall": [recall_score(target, pred)],
        "Precision": [precision_score(target, pred)],
        "F1": [f1_score(target, pred)],
    }
    return pd.DataFrame(scores)
def confusion_matrix_sklearn(model, predictors, target):
    """
    Plot the 2x2 confusion matrix as a heatmap, each cell annotated with its
    count and its percentage of all predictions.

    model: classifier
    predictors: independent variables
    target: dependent variable (true labels)
    """
    cm = confusion_matrix(target, model.predict(predictors))
    flat = cm.flatten()
    total = flat.sum()  # hoisted; the original recomputed this per cell
    annotations = np.asarray(
        ["{0:0.0f}".format(count) + "\n{0:.2%}".format(count / total) for count in flat]
    ).reshape(2, 2)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=annotations, fmt="")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
    plt.show()
# Confusion matrix of the baseline tree on the training data
confusion_matrix_sklearn(decision_tree_baseline_model, X_train, y_train)
# Evaluate the performance of the Decision Tree model on the training set
decision_tree_perf_train_without = model_performance_classification_sklearn(
    decision_tree_baseline_model,  # The trained Decision Tree model
    X_train,  # The training set features
    y_train  # The true labels for the training set
)
# Display the performance metrics for the Decision Tree model on the training set
decision_tree_perf_train_without
# Checking model performance on the test data
decision_tree_perf_test_without = model_performance_classification_sklearn(
    decision_tree_baseline_model, X_test, y_test
)
decision_tree_perf_test_without
Model Performance (baseline_decision_tree_model)
High Accuracy: The model’s accuracy is 98.11%, correctly classifying most customers regarding loan acceptance.
Solid Recall: With a recall of 88.28%, the model identifies most potential loan acceptors, though some are missed.
Strong Precision: The model’s precision is 92.09%, meaning its predictions of loan acceptance are mostly accurate.
Balanced F1-Score: The F1-Score of 90.14% shows a good balance between precision and recall, effectively predicting loan acceptances.
# Check class distribution in the training data
print(y_train.value_counts())
# Initialize the Decision Tree classifier with balanced class weights to handle class imbalance
decision_tree_balanced_model = DecisionTreeClassifier(random_state=1, class_weight="balanced")
# Fit the model on the training data
decision_tree_balanced_model.fit(X_train, y_train)
# Train-set confusion matrix and metrics
confusion_matrix_sklearn(decision_tree_balanced_model, X_train, y_train)
decision_tree_perf_train = model_performance_classification_sklearn(
    decision_tree_balanced_model, X_train, y_train
)
decision_tree_perf_train
# Test-set confusion matrix and metrics
confusion_matrix_sklearn(decision_tree_balanced_model, X_test, y_test)
decision_tree_perf_test = model_performance_classification_sklearn(
    decision_tree_balanced_model, X_test, y_test
)
decision_tree_perf_test
Pre-pruning helps prevent overfitting and improves generalization by controlling tree complexity, which can enhance performance on test data and handle class imbalance more effectively.
# Initialize a Decision Tree Classifier for pre-pruning via grid search
decision_tree_pre_pruning_model = DecisionTreeClassifier(random_state=1)
# Define the grid of parameters to choose from
parameters = {
    "class_weight": [None, "balanced"],
    "max_depth": np.arange(2, 7, 2),  # Maximum depth of the tree: [2, 4, 6]
    "max_leaf_nodes": [50, 75, 150, 250],  # Maximum leaf nodes in the tree
    "min_samples_split": [10, 30, 50, 70],  # Minimum samples required to split an internal node
}
# Scoring metric for the search.
# NOTE(review): despite the name "acc_scorer", this optimizes RECALL — which
# matches the stated goal of minimizing false negatives, but the name misleads.
acc_scorer = make_scorer(recall_score)
# Run the grid search (5-fold CV) to find the best parameters
grid_obj = GridSearchCV(decision_tree_pre_pruning_model, parameters, scoring=acc_scorer, cv=5)
grid_obj = grid_obj.fit(X_train, y_train)
# Set the estimator to the best combination of parameters
decision_tree_pre_pruning_model = grid_obj.best_estimator_
# Fit the best algorithm to the data
decision_tree_pre_pruning_model.fit(X_train, y_train)
# Train- and test-set confusion matrices and metrics for the tuned tree
confusion_matrix_sklearn(decision_tree_pre_pruning_model, X_train, y_train)
decision_tree_tune_perf_train = model_performance_classification_sklearn(
    decision_tree_pre_pruning_model, X_train, y_train
)
decision_tree_tune_perf_train
confusion_matrix_sklearn(decision_tree_pre_pruning_model, X_test, y_test)
decision_tree_tune_perf_test = model_performance_classification_sklearn(
    decision_tree_pre_pruning_model, X_test, y_test
)
decision_tree_tune_perf_test
Perfect Recall, Low Precision: The model achieves 100% recall but with low precision (34.44%), indicating all positives are correctly identified, but many predicted positives are false.
Accuracy and F1-Score Trade-Off: The accuracy is 81.41% and F1-Score is 51.24%, showing a trade-off between high recall and lower precision.
feature_names = list(X_train.columns)
# NOTE(review): `importances`/`indices` computed here are unused in this cell;
# the feature-importance plotting helper recomputes them from the estimator.
importances = decision_tree_pre_pruning_model.feature_importances_
indices = np.argsort(importances)
from sklearn.tree import plot_tree  # Import the plot_tree function from sklearn.tree
# Set the figure size for the plot
plt.figure(figsize=(20, 10))
# Plot the decision tree
out = plot_tree(
    decision_tree_pre_pruning_model,  # Model to plot
    feature_names=feature_names,  # Feature names used in the model
    filled=True,  # Fill the nodes with colors
    fontsize=9,  # Set the font size of the text in the nodes
    node_ids=False,  # Do not show node IDs
    class_names=None,  # Do not show class names
)
# Add arrows to the decision tree splits if they are missing
for o in out:
    arrow = o.arrow_patch
    if arrow is not None:
        arrow.set_edgecolor("black")
        arrow.set_linewidth(1)
# Display the plot
plt.show()
from sklearn.tree import export_text  # Import the export_text function from sklearn.tree
# Text report showing the rules of the decision tree
print(export_text(decision_tree_pre_pruning_model, feature_names=feature_names, show_weights=True))
Observations:
The decision tree uses Income, CCAvg, and Education to make predictions. Higher Income and Education, along with specific CCAvg values, tend to favor predictions of customers who will purchase a loan. The class weights provide insights into the distribution of samples at each node, showing the model’s tendency to classify based on the predominant outcome (purchasing or not purchasing a loan) in each region of the feature space.
# NOTE(review): redundant — the plotting helper below reads
# feature_importances_ directly from the estimator it is given.
importances = decision_tree_pre_pruning_model.feature_importances_
def plot_and_summarize_feature_importances(estimator, feature_names):
    """
    Print a tabular summary of a fitted tree's feature importances and draw
    them as a horizontal bar chart annotated with percentage labels.

    estimator: fitted estimator exposing ``feature_importances_``.
    feature_names: list of column names matching the training matrix.
    """
    raw = estimator.feature_importances_
    order = np.argsort(raw)  # ascending, so the largest bar ends up on top
    pct = (raw[order] / np.sum(raw)) * 100

    summary = pd.DataFrame({
        'Feature': np.array(feature_names)[order],
        'Importance': raw[order],
        'Percentage': pct,
    })
    print("Feature Importances Summary:")
    print(summary)

    plt.figure(figsize=(12, 8))
    plt.title("Feature Importances (Percentages)")
    positions = range(len(order))
    bars = plt.barh(positions, pct, color="violet", align="center")
    plt.yticks(positions, [feature_names[i] for i in order])
    plt.xlabel("Percentage Importance")
    # Annotate each bar with its percentage just past the bar end.
    for rect in bars:
        w = rect.get_width()
        plt.text(w + 1, rect.get_y() + rect.get_height() / 2,
                 f'{w:.2f}%',
                 va='center')
    plt.show()
# Plot and summarize the feature importances of the pre-pruned tree.
plot_and_summarize_feature_importances(decision_tree_pre_pruning_model, feature_names)
Observations:
The DecisionTreeClassifier can use post-pruning techniques to improve model generalization and avoid overfitting. By adjusting the ccp_alpha parameter, which controls cost complexity pruning, the model can prune less important nodes and reduce tree complexity. Higher ccp_alpha values lead to more pruning, increasing the total impurity of leaves but potentially improving generalization. Using DecisionTreeClassifier.cost_complexity_pruning_path, one can determine effective ccp_alpha values to optimize tree performance.
# Compute the cost complexity pruning path to determine candidate alphas for
# post-pruning the decision tree.
clf = DecisionTreeClassifier(random_state=1, class_weight="balanced")  # {0: 0.15, 1: 0.85}
path = clf.cost_complexity_pruning_path(X_train, y_train)
# abs() guards against tiny negative alphas caused by floating-point error.
ccp_alphas, impurities = abs(path.ccp_alphas), path.impurities
pd.DataFrame(path)
# Plot the pruning path: total leaf impurity as a function of effective alpha.
fig, ax = plt.subplots(figsize=(10, 5))
# The last alpha prunes the tree to a single node, so it is dropped for readability.
ax.plot(ccp_alphas[:-1], impurities[:-1], marker="o", drawstyle="steps-post")
ax.set_xlabel("effective alpha")
ax.set_ylabel("total impurity of leaves")
ax.set_title("Total Impurity vs effective alpha for training set")
plt.show()
Summary of Cost Complexity Pruning Path:
Next, we train a decision tree using the effective alphas. The last value in ccp_alphas is the alpha value that prunes the whole tree, leaving the tree, clfs[-1], with one node.
# Train a series of Decision Trees, one per candidate ccp_alpha, and keep them in a list.
clfs = []
for ccp_alpha in ccp_alphas:
    clf = DecisionTreeClassifier(
        random_state=1, ccp_alpha=ccp_alpha, class_weight="balanced"
    )
    clf.fit(X_train, y_train)
    clfs.append(clf)
# The largest alpha prunes everything: the final tree collapses to one node.
print(
    "Number of nodes in the last tree is: {} with ccp_alpha: {}".format(
        clfs[-1].tree_.node_count, ccp_alphas[-1]
    )
)
High Pruning Level: With a ccp_alpha value of 0.28883, the decision tree has been pruned significantly, resulting in only one node in the final tree.
Over-Pruning: Such a high ccp_alpha value suggests excessive pruning, which may lead to an overly simplistic model that might not capture the complexity of the data effectively.
Model Evaluation Needed: This high level of pruning could lead to underfitting. It's crucial to evaluate this pruned model's performance to ensure it still meets the desired criteria for accuracy and generalization.
# Remove the last (trivial) tree from the list — it is the fully pruned
# single-node tree and is not a useful candidate model.
clfs = clfs[:-1]
ccp_alphas = ccp_alphas[:-1]
# Calculating Number of Nodes and Depths for each remaining tree:
nodes = [clf.tree_.node_count for clf in clfs]
depths = [clf.tree_.max_depth for clf in clfs]
# Plot node count (left axis, red) and tree depth (right axis, blue) vs alpha.
fig, ax1 = plt.subplots(figsize=(10, 5))
color = 'tab:red'
ax1.set_xlabel('effective alpha')
ax1.set_ylabel('Number of nodes', color=color)
ax1.plot(ccp_alphas, nodes, marker='o', color=color, label='Number of nodes')
ax1.tick_params(axis='y', labelcolor=color)
ax2 = ax1.twinx()  # second y-axis sharing the same x-axis
color = 'tab:blue'
ax2.set_ylabel('Tree depth', color=color)
ax2.plot(ccp_alphas, depths, marker='o', color=color, label='Tree depth')
ax2.tick_params(axis='y', labelcolor=color)
fig.tight_layout()
plt.title('Number of nodes and Tree depth vs. effective alpha')
plt.show()
# Print number of nodes and tree depth for each ccp_alpha
print("Effective Alpha | Number of Nodes | Tree Depth")
print("---------------------------------------------")
for alpha, clf in zip(ccp_alphas, clfs):
    num_nodes = clf.tree_.node_count
    depth = clf.tree_.max_depth
    print(f"{alpha:.5f} | {num_nodes:>15} | {depth:>9}")
Insights:
# Initialize a list to store training-set recall scores, one per pruned tree.
recall_train = []
# Iterate over the list of pruned classifiers
for clf in clfs:
    # Predict on the training set
    pred_train = clf.predict(X_train)
    # Calculate the recall score
    values_train = recall_score(y_train, pred_train)
    # Append the recall score to the list
    recall_train.append(values_train)
# Print recall scores for each alpha value
print("Effective Alpha | Recall (Training Set)")
print("---------------------------------------")
for alpha, recall in zip(ccp_alphas, recall_train):
    print(f"{alpha:.5f} | {recall:.5f}")
# Calculate recall scores for each decision tree model on the test set
recall_test = []
for clf in clfs:
    pred_test = clf.predict(X_test)  # Predict using the test data
    values_test = recall_score(y_test, pred_test)  # Compute recall score
    recall_test.append(values_test)  # Append recall score to the list
# Calculate training and testing accuracy for each decision tree model
train_scores = [clf.score(X_train, y_train) for clf in clfs]  # Compute training accuracy for each model
test_scores = [clf.score(X_test, y_test) for clf in clfs]  # Compute testing accuracy for each model
# Plot recall vs alpha for both splits.
fig, ax = plt.subplots(figsize=(15, 5))
ax.set_xlabel("Alpha")
ax.set_ylabel("Recall")
ax.set_title("Recall vs Alpha for Training and Testing Sets")
ax.plot(
    ccp_alphas, recall_train, marker="o", label="Training Recall", drawstyle="steps-post",
)
ax.plot(ccp_alphas, recall_test, marker="o", label="Testing Recall", drawstyle="steps-post")
ax.legend()
plt.show()
# Print Recall scores vs. Alpha for training and testing sets.
# BUG FIX: the original reused `recall_train` and `recall_test` as loop
# variables, clobbering the lists built above — after the loop `recall_test`
# was bound to a single float, which silently broke the np.argmax model
# selection that follows. Distinct loop-variable names keep the lists intact.
print("Effective Alpha | Training Recall | Testing Recall")
print("---------------------------------------------------")
for alpha, tr_recall, te_recall in zip(ccp_alphas, recall_train, recall_test):
    print(f"{alpha:.5f} | {tr_recall:.5f} | {te_recall:.5f}")
# Find the index of the model with the highest recall score on the test set.
# NOTE(review): this relies on `recall_test` still being the per-alpha list;
# the print loop just above reuses `recall_train`/`recall_test` as loop
# variables, leaving them bound to the last scalar value — verify before
# trusting this argmax.
index_best_model = np.argmax(recall_test)  # Get index of the best model based on recall
# Select the best model based on the highest recall score
decision_tree_post_pruning_best_recall_model = clfs[index_best_model]  # Best model with highest recall
# Print the best model details
print(decision_tree_post_pruning_best_recall_model)
# Confusion matrix of the selected post-pruned tree on the training set.
confusion_matrix_sklearn(decision_tree_post_pruning_best_recall_model, X_train, y_train)
# Evaluate the performance of the best post-pruning model on the training set
decision_tree_post_perf_train = model_performance_classification_sklearn(
    decision_tree_post_pruning_best_recall_model, X_train, y_train
)
decision_tree_post_perf_train
# Same two views on the held-out test set.
confusion_matrix_sklearn(decision_tree_post_pruning_best_recall_model, X_test, y_test)
decision_tree_post_test = model_performance_classification_sklearn(
    decision_tree_post_pruning_best_recall_model, X_test, y_test
)
decision_tree_post_test
Observation:
In the post-pruned tree, the model demonstrates strong performance with high accuracy and balanced recall, precision, and F1 scores. The recall score on the test set is approximately 0.90, indicating that the model effectively identifies positive instances while generalizing well on unseen data. The overall performance metrics suggest that the model is robust and capable of maintaining balanced predictions across both training and testing datasets.
# Plot the post-pruned tree that achieved the best test-set recall.
plt.figure(figsize=(30, 15))
# Plot the decision tree
out = plot_tree(
    decision_tree_post_pruning_best_recall_model,
    feature_names=feature_names,
    filled=True,
    fontsize=9,
    node_ids=False,
    class_names=None
)
# Customize arrow colors for better visibility
for o in out:
    arrow = o.arrow_patch
    if arrow is not None:
        arrow.set_edgecolor("black")
        arrow.set_linewidth(1)
# Show the plot
plt.show()
# Text report showing the decision rules of the post-pruned tree.
print(export_text(decision_tree_post_pruning_best_recall_model, feature_names=feature_names, show_weights=True))
Insights:
# Train a fresh, unconstrained decision tree just for feature-importance analysis.
# NOTE(review): unlike the earlier trees this one has no random_state and no
# class_weight — importances may vary between runs; confirm this is intentional.
clf = DecisionTreeClassifier()
feature_names = X_train.columns  # NOTE: rebinds feature_names (list -> pandas Index)
clf.fit(X_train, y_train)
# Extract feature importances
importances = clf.feature_importances_
# Calculate percentages
total_importance = np.sum(importances)
percentages = (importances / total_importance) * 100
# Sort indices of importances in descending order
indices = np.argsort(importances)[::-1]
# Create the bar plot
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
bars = plt.barh(range(len(indices)), percentages[indices], color="violet", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance (%)")
# Add percentage labels to the bars
for bar in bars:
    width = bar.get_width()
    plt.text(width, bar.get_y() + bar.get_height()/2, f'{width:.2f}%', va='center', ha='left', fontsize=10)
plt.show()
# Create a DataFrame for easy viewing
feature_summary = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances,
    'Percentage of Total Importance (%)': percentages
})
# Sort the DataFrame by importance in descending order
feature_summary_sorted = feature_summary.sort_values(by='Importance', ascending=False).reset_index(drop=True)
# Print the summary
print("Feature Importances Summary:")
print(feature_summary_sorted)
Observations:
The XGBoost model, which stands for Extreme Gradient Boosting, is a powerful and efficient machine learning algorithm for classification tasks. This script involves tuning the XGBoost model using Optuna to optimize its hyperparameters, aiming to maximize performance, particularly focusing on recall.
#Approach 1: XGBoost Model with Optuna Tuning
#--------------------------------------------
# This script uses Optuna to tune hyperparameters for an XGBoost model and evaluates its performance.
# The goal is to find the optimal hyperparameters that maximize the model's performance, particularly focusing on recall.
# The script includes the following steps:
#
# 1. Define the objective function for Optuna optimization, where the XGBoost model is trained and evaluated based on accuracy.
# 2. Optimize hyperparameters using Optuna with a specified number of trials.
# 3. Retrieve and print the best hyperparameters from the optimization process.
# 4. Train an XGBoost model using the best hyperparameters.
# 5. Evaluate the model's performance on the test set, including accuracy, recall, precision, and F1-score.
# 6. Perform cross-validation to assess the model's stability and mean accuracy.
# 7. Analyze feature importance to understand the contribution of each feature to the model's predictions.
# 8. Use SHAP (SHapley Additive exPlanations) for model interpretability to visualize the impact of each feature on the model's predictions.
# Approach 1 (Optuna) data prep: work on a fresh copy of the cleaned dataset.
df_cleaned_approach1 = df_original_cleaned2.copy()
# One-hot encode categorical variables
df_encoded = pd.get_dummies(df_cleaned_approach1, columns=['ZIPCode', 'Education', 'Family'], drop_first=True)
# Define features and target variable
X = df_encoded.drop(columns=['Personal_Loan'])
y = df_encoded['Personal_Loan']
# Split the dataset into training and testing sets.
# NOTE: rebinds X/X_train/... used by the earlier decision-tree cells.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Define the objective function for Optuna optimization
def objective(trial):
param = {
'n_estimators': trial.suggest_int('n_estimators', 50, 300),
'max_depth': trial.suggest_int('max_depth', 3, 10),
'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
'subsample': trial.suggest_float('subsample', 0.6, 1.0),
'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
'gamma': trial.suggest_float('gamma', 0, 0.3),
'scale_pos_weight': trial.suggest_int('scale_pos_weight', 1, 10)
}
xgboost_optuna_tuned_model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss', **param)
xgboost_optuna_tuned_model.fit(X_train, y_train)
preds = xgboost_optuna_tuned_model.predict(X_test)
accuracy = accuracy_score(y_test, preds)
return accuracy
# Optimize hyperparameters using Optuna (TPE sampler seeded for a reproducible search).
study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42))
study.optimize(objective, n_trials=50)
# Get the best parameters
best_params = study.best_params
print("Best Hyperparameters:", best_params)
# Train the XGBoost model with the best hyperparameters.
# NOTE(review): the objective scores each trial on the test set, so
# best_params are selected on test data — the test metrics below are
# optimistically biased; consider tuning on a separate validation split.
best_xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss', **best_params)
best_xgb.fit(X_train, y_train)
# Evaluate the best model on the training set
performance_best_xgb_train = {
    'Model': 'XGBoost (Tuned with Optuna)',
    'Accuracy': accuracy_score(y_train, best_xgb.predict(X_train)),
    'Recall': recall_score(y_train, best_xgb.predict(X_train)),
    'Precision': precision_score(y_train, best_xgb.predict(X_train)),
    'F1-Score': f1_score(y_train, best_xgb.predict(X_train))
}
# Evaluate the best model on the test set
performance_best_xgb_test = {
    'Model': 'XGBoost (Tuned with Optuna)',
    'Accuracy': accuracy_score(y_test, best_xgb.predict(X_test)),
    'Recall': recall_score(y_test, best_xgb.predict(X_test)),
    'Precision': precision_score(y_test, best_xgb.predict(X_test)),
    'F1-Score': f1_score(y_test, best_xgb.predict(X_test))
}
# Print performance of the best XGBoost model on the training set
print("Tuned XGBoost Model Performance on Training Set with Optuna:")
print(pd.DataFrame([performance_best_xgb_train]))
# Print performance of the best XGBoost model on the test set
print("Tuned XGBoost Model Performance on Test Set with Optuna:")
print(pd.DataFrame([performance_best_xgb_test]))
# Cross-Validation.
# NOTE(review): CV runs on the full (X, y), which includes the test rows.
cv_scores = cross_val_score(best_xgb, X, y, cv=5)
print("Cross-Validation Accuracy Scores:", cv_scores)
print("Mean Cross-Validation Accuracy:", cv_scores.mean())
# Feature Importance Analysis
importance = best_xgb.feature_importances_
feature_importance = pd.DataFrame({'Feature': X.columns, 'Importance': importance})
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)
print("Feature Importance:")
print(feature_importance)
# Model Interpretability using SHAP.
# Cast to float so SHAP sees a uniform numeric dtype (the one-hot columns
# from get_dummies are not float by default).
X_train_float = X_train.astype(float)
X_test_float = X_test.astype(float)
explainer = shap.Explainer(best_xgb, X_train_float)
shap_values = explainer(X_test_float)
shap.summary_plot(shap_values, X_test_float)
Model Performance: The tuned XGBoost model achieved an impressive accuracy of 99.19%, with a recall of 93.10%, precision of 97.59%, and an F1-score of 95.29%, demonstrating strong overall performance.
Cross-Validation Stability: The model’s cross-validation accuracy scores ranged from 98.48% to 99.19%, with a mean accuracy of 98.83%, indicating consistent performance across different data subsets.
Feature Importance: Feature importance scores highlight CD_Account and Income as key predictors, while ZIP code features show varying levels of significance, providing insights into feature contributions.
The cross-validation accuracy scores, with a mean of 0.9883, confirm the robustness and consistency of the XGBoost model's performance across different subsets of data.
# Training performance comparison across all models.
# Reformat the XGBoost metrics to match the decision-tree perf frames:
# drop the Model column and rename F1-Score -> F1.
xgboost_perf_train = pd.DataFrame([performance_best_xgb_train])
xgboost_perf_train = xgboost_perf_train.drop(columns=['Model'])
xgboost_perf_train['F1'] = xgboost_perf_train['F1-Score']
xgboost_perf_train = xgboost_perf_train.drop(columns=['F1-Score'])
# One column per model (transposed metric frames), then label the columns.
models_train_comp_df = pd.concat(
    [
        decision_tree_perf_train_without.T,
        decision_tree_perf_train.T,
        decision_tree_tune_perf_train.T,
        decision_tree_post_perf_train.T,
        xgboost_perf_train.T,
    ],
    axis=1,
)
models_train_comp_df.columns = [
    "Decision Tree without class_weight",
    "Decision Tree with class_weight",
    "Decision Tree (Pre-Pruning)",
    "Decision Tree (Post-Pruning)",
    "XGBoost (Tuned with Optuna)"
]
print("Training performance comparison:")
models_train_comp_df.T
# Test-set performance comparison across all models.
# Format the XGBoost test metrics the same way as the training comparison.
xgboost_perf_test = pd.DataFrame([performance_best_xgb_test])
xgboost_perf_test = xgboost_perf_test.drop(columns=['Model'])
xgboost_perf_test['F1'] = xgboost_perf_test['F1-Score']
xgboost_perf_test = xgboost_perf_test.drop(columns=['F1-Score'])
models_test_comp_df = pd.concat(
    [
        decision_tree_perf_test_without.T,
        decision_tree_perf_test.T,
        decision_tree_tune_perf_test.T,
        decision_tree_post_test.T,
        xgboost_perf_test.T,
    ],
    axis=1,
)
models_test_comp_df.columns = [
    "Decision Tree without class_weight",
    "Decision Tree with class_weight",
    "Decision Tree (Pre-Pruning)",
    "Decision Tree (Post-Pruning)",
    "XGBoost (Tuned with Optuna)"
]
print("Test set performance comparison:")
models_test_comp_df.T
Observations:
Pre-pruning Trade-offs: While pre-pruning achieves perfect recall, it does so at the cost of lower precision, accuracy, and F1 score, indicating potential trade-offs in performance metrics.
XGBoost (Tuned with Optuna):
- Recall (0.93103): The highest recall among the models, demonstrating the XGBoost model's superior ability to identify potential customers who are likely to purchase a personal loan. This is crucial for the bank's objective of increasing the conversion rate from liability customers to personal loan customers.
- Accuracy (0.99192): Exceptional overall accuracy, indicating that the model performs well in distinguishing between customers who will and will not buy a loan.
- Precision (0.97590): High precision, ensuring that the model accurately identifies loan buyers while minimizing false positives, which is important to avoid unnecessary marketing efforts on customers unlikely to purchase.
- F1 Score (0.95294): An excellent F1 score that balances recall and precision, reflecting the model's strong overall performance and its effectiveness in predicting loan acceptance
- Mean Cross-Validation Accuracy (0.9883): Consistent and robust performance across different data subsets, reinforcing the reliability of the XGBoost model in various scenarios.
Conclusion
The analysis of AllLife Bank's customer data reveals that Education and Income are the most significant factors influencing the likelihood of accepting a personal loan. The XGBoost model, with hyperparameters tuned using Optuna, achieved the highest performance among all models tested, demonstrating superior accuracy (0.99192), recall (0.93103), precision (0.97590), and F1-Score (0.95294). This model’s performance highlights its effectiveness in predicting loan acceptance, making it the most reliable tool for identifying potential loan customers.
The Decision Tree model with class weights also performed well, handling class imbalances effectively. However, the XGBoost model outperformed all other approaches, making it the preferred choice for maximizing loan conversion rates. The pre-pruning model's significant drop in performance further underscores the need to maintain model complexity for accurate predictions.
To leverage these insights, AllLife Bank should continue focusing on high-income and highly educated customers while incorporating advanced models like XGBoost for better predictions. Regular updates to models and exploring additional customer features will ensure optimal performance and effectiveness in marketing campaigns.
Actionable Insights:
1. Strategic Use of the XGBoost Model:
Primary Predictive Tool: The XGBoost model, with its superior performance metrics, should be central to AllLife Bank's strategy for identifying potential loan customers. Its high accuracy and recall rates make it invaluable for targeting the most promising candidates and ensuring effective outreach.
Ongoing Optimization: Regularly revisit and fine-tune the XGBoost model to stay aligned with evolving customer data and market dynamics. This will ensure sustained accuracy and relevance in predictions.
2. Feature Importance and Customer Profiling:
3. Class Weighting in Decision Models:
Business Recommendations:
Leverage XGBoost Model:
Enhance Targeting with Key Features:
Implement Data-Driven Marketing Campaigns:
Feature-Based Campaigns: Create personalized offers based on key features from the model.
Segment Strategies: Design targeted marketing strategies for different customer segments.
Integrate Class Weighting Techniques:
Complement XGBoost: Use class weighting in decision trees to handle imbalances effectively.
Balanced Approach: Apply class weighting where simpler models are needed.
Optimize Customer Profiling:
Regularly Review and Update Models:
Enhance Customer Engagement:
Explore Additional Data Sources:
Utilize the XGBoost model for accurate loan predictions, prioritize key customer attributes, and tailor marketing strategies based on detailed customer profiles. Regularly update models and integrate additional data to refine predictions and boost loan conversion rates.
'''
Approach 1: Baseline Model without Feature Engineering and Outlier Treatment
============================================================================
Steps:
1. Model Training
2. Model Evaluation
3. Hyperparameter Tuning
4. Cross-Validation
5. Feature Importance Analysis
6. Model Interpretability
'''
# Fresh copy of the cleaned data for the baseline run.
df_cleaned_approach1 = df_original_cleaned2.copy()
# One-hot encode categorical variables
df_encoded = pd.get_dummies(df_cleaned_approach1, columns=['ZIPCode', 'Education', 'Family'], drop_first=True)
# Define features and target variable
X = df_encoded.drop(columns=['Personal_Loan'])
y = df_encoded['Personal_Loan']
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Dictionary of candidate baseline models to compare.
models = {
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'SVM': SVC(random_state=42)
}
# Dictionary to store the performance metrics (filled in by evaluate_model).
performance = {
    'Model': [],
    'Accuracy': [],
    'Recall': [],
    'Precision': [],
    'F1-Score': []
}
# Function to evaluate and store performance metrics
def evaluate_model(name, model, X_test, y_test):
y_pred = model.predict(X_test)
performance['Model'].append(name)
performance['Accuracy'].append(accuracy_score(y_test, y_pred))
performance['Recall'].append(recall_score(y_test, y_pred))
performance['Precision'].append(precision_score(y_test, y_pred))
performance['F1-Score'].append(f1_score(y_test, y_pred))
# Train and evaluate each candidate model; evaluate_model appends the
# metrics into the shared `performance` dict.
for name, model in models.items():
    model.fit(X_train, y_train)
    evaluate_model(name, model, X_test, y_test)
# Create a DataFrame to display the performance
performance_df = pd.DataFrame(performance)
# Print the performance comparison
print("Model Performance:")
print(performance_df)
# Hyperparameter tuning for the best baseline model (XGBoost).
param_dist = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3],
    'scale_pos_weight': [1, 5, 10]  # Adjust for class imbalance
}
# Initialize XGBClassifier
xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
# Randomized Search over 50 sampled configurations with 5-fold cross-validation.
random_search = RandomizedSearchCV(xgb, param_distributions=param_dist, n_iter=50, cv=5, verbose=1, n_jobs=-1, random_state=42)
random_search.fit(X_train, y_train)
# Best estimator found by the randomized search.
best_xgb = random_search.best_estimator_
# Evaluate the best model on the held-out test set.
performance_best_xgb = {
    'Model': 'XGBoost (Tuned)',
    'Accuracy': accuracy_score(y_test, best_xgb.predict(X_test)),
    'Recall': recall_score(y_test, best_xgb.predict(X_test)),
    'Precision': precision_score(y_test, best_xgb.predict(X_test)),
    'F1-Score': f1_score(y_test, best_xgb.predict(X_test))
}
# Print performance of the best XGBoost model
print("Tuned XGBoost Model Performance:")
print(pd.DataFrame([performance_best_xgb]))
# Cross-Validation.
# NOTE(review): CV runs on the full (X, y), which includes the test rows.
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(best_xgb, X, y, cv=5)
print("Cross-Validation Accuracy Scores:", cv_scores)
print("Mean Cross-Validation Accuracy:", cv_scores.mean())
# Feature Importance Analysis
importance = best_xgb.feature_importances_
feature_importance = pd.DataFrame({'Feature': X.columns, 'Importance': importance})
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)
print("Feature Importance:")
print(feature_importance)
# Model Interpretability using SHAP
# Ensure that SHAP is correctly handling the input data types:
# convert data to float64 if necessary.
X_train_float = X_train.astype(float)
X_test_float = X_test.astype(float)
explainer = shap.Explainer(best_xgb, X_train_float)
shap_values = explainer(X_test_float)
shap.summary_plot(shap_values, X_test_float)
# remove outliers based on Z-scores
def remove_outliers(data, threshold=3):
    """
    Return a copy of *data* with rows containing extreme numeric values removed.

    A row is dropped when ANY of its numeric columns has an absolute Z-score
    above *threshold*. Non-numeric columns are ignored for the test but kept
    in the output.

    data: pandas DataFrame.
    threshold: Z-score cutoff above which a value counts as an outlier (default 3).
    """
    # Work on a copy so the caller's DataFrame is untouched.
    df_outliers_cleaned = data.copy()
    # Absolute Z-scores over the numeric columns only.
    z_scores = np.abs(stats.zscore(df_outliers_cleaned.select_dtypes(include=[np.number])))
    # BUG FIX: the original used .all(axis=1), which flags a row only when
    # EVERY numeric column is extreme simultaneously — in practice removing
    # almost nothing. A row with any extreme value is an outlier row.
    outliers = (z_scores > threshold).any(axis=1)
    # Keep only the non-outlier rows.
    df_outliers_cleaned = df_outliers_cleaned[~outliers]
    return df_outliers_cleaned
# df_original_cleaned.info()
# Remove outliers from the local DataFrame
df_outlier_removed = remove_outliers(df_original_cleaned)
df_outlier_removed_original = df_outlier_removed.copy()
# Display information about the cleaned DataFrame.
# (info() prints its report and returns None, so this also prints "None".)
print("DataFrame after outlier removal:")
print(df_outlier_removed.info())
# Approach 2: Model with Outlier Treatment
# Steps:
# 1. Outlier Treatment: Apply techniques to handle outliers in the dataset.
# 2. Model Training: Train various models with the treated dataset.
# 3. Model Evaluation: Evaluate the models using accuracy, recall, precision, and F1-score.
# 4. Hyperparameter Tuning: Perform hyperparameter tuning for the best-performing model.
# 5. Cross-Validation: Perform cross-validation to ensure the model's performance.
# 6. Feature Importance Analysis: Analyze the importance of each feature in the model.
# 7. Model Interpretability: Use SHAP or other interpretability methods to explain the model's predictions.
df_original_cleaned.info()
# Load your cleaned data
df_with_outliers = df_original_cleaned2.copy()
# Remove outliers from the local DataFrame
df_outlier_removed = remove_outliers(df_with_outliers)
# Define target and features.
# NOTE(review): unlike Approach 1, the categorical columns are not one-hot
# encoded here — they enter the models as scaled numeric codes; confirm
# this is intentional.
X = df_outlier_removed.drop(columns=['Personal_Loan'])
y = df_outlier_removed['Personal_Loan']
# Split the data into training and testing sets (70/30 here vs 80/20 above).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize the features (fit on train only, then apply to test).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Impute missing values (if any)
imputer = SimpleImputer(strategy='mean')
X_train_scaled = imputer.fit_transform(X_train_scaled)
X_test_scaled = imputer.transform(X_test_scaled)
# Define the candidate models for Approach 2.
# NOTE(review): the two "Post-Pruning"/"Decision Tree" entries use default
# settings, so several entries are effectively identical trees — verify.
models = {
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Decision Tree (No Weights)': DecisionTreeClassifier(random_state=42),
    'Decision Tree (Class Weight)': DecisionTreeClassifier(class_weight='balanced', random_state=42),
    'Decision Tree (Pre-Pruning)': DecisionTreeClassifier(max_depth=5, random_state=42),
    'Decision Tree (Post-Pruning)': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'SVM': SVC(random_state=42, probability=True),  # probability=True enables probability estimates
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}
# Train and evaluate models
def evaluate_model(model, X_train, X_test, y_train, y_test):
model.fit(X_train, y_train)
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
recall = recall_score(y_test, predictions)
precision = precision_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
cv_scores = cross_val_score(model, X, y, cv=5, scoring='f1')
return accuracy, recall, precision, f1, cv_scores.mean()
# Initialize the results table (one list per column).
model_results = {
    'Model': [],
    'Accuracy': [],
    'Recall': [],
    'Precision': [],
    'F1-Score': [],
    'Cross-Val F1-Score': []
}
# Evaluate all models; metrics are stored as 5-decimal strings for display.
for name, model in models.items():
    acc, recall, prec, f1, cv_f1 = evaluate_model(model, X_train_scaled, X_test_scaled, y_train, y_test)
    model_results['Model'].append(name)
    model_results['Accuracy'].append(f"{acc:.5f}")
    model_results['Recall'].append(f"{recall:.5f}")
    model_results['Precision'].append(f"{prec:.5f}")
    model_results['F1-Score'].append(f"{f1:.5f}")
    model_results['Cross-Val F1-Score'].append(f"{cv_f1:.5f}")
# Create a DataFrame to display the results
results_df = pd.DataFrame(model_results)
# Print the results
print("Model Performance Results:")
print(results_df)
# Feature Importance and SHAP Analysis
# Initialize the SHAP explainer and analysis for models that support it
# Feature Importance and SHAP Analysis
# Initialize the SHAP explainer and analysis for models that support it
def analyze_shap(model, X_train, X_test, feature_names):
    """
    Analyzes SHAP values for the given model and data.
    Only tree ensembles (RandomForest / GradientBoosting / XGBoost) are
    analyzed; any other estimator is skipped with a message.
    Parameters:
    - model: Trained model object
    - X_train: Training feature data (used as the explainer background)
    - X_test: Test feature data (rows being explained)
    - feature_names: List of feature names
    """
    if isinstance(model, (RandomForestClassifier, GradientBoostingClassifier, XGBClassifier)):
        try:
            # Initialize the SHAP Explainer
            explainer = shap.Explainer(model, X_train)
            # Calculate SHAP values
            shap_values = explainer(X_test, check_additivity=False)  # Disable additivity check
            # Create summary plot
            shap.summary_plot(shap_values, X_test, feature_names=feature_names)
        except ImportError:
            print("SHAP library is not installed or is not compatible.")
        except TypeError as e:
            print(f"Error in SHAP interpretation: {e}")
    else:
        print(f"SHAP analysis is not supported for {model.__class__.__name__}")
# Perform SHAP analysis for models that support it.
# (The models were already fitted inside the evaluation loop above.)
for name, model in models.items():
    print(f"Analyzing SHAP values for {name}...")
    analyze_shap(model, X_train_scaled, X_test_scaled, X.columns)
# Approach 3: Model with Feature Engineering without treating outliers
# Steps to be covered:
# 1. Feature Engineering
# 2. Model Training
# 3. Model Evaluation
# 4. Hyperparameter Tuning
# 5. Cross-Validation
# 6. Feature Importance Analysis
# 7. Model Interpretability
# Make a local copy of the original DataFrame for Approach 3
df_original_cleaned_copy_approach3 = df_original_cleaned2.copy()
# Feature Engineering
def feature_engineering(df):
    """
    Add engineered features for Approach 3 and return a new DataFrame.

    Added columns:
    - Income_CCAvg: interaction term Income * CCAvg
    - Age_Square: polynomial feature Age ** 2
    - Experience_Bin: Experience discretized into 5 equal-width bins (int codes)
    - Degree-2 polynomial terms over (Age, Income, CCAvg), named exactly as
      sklearn's PolynomialFeatures would name them ('Age^2', 'Age Income', ...)

    Rows containing NaN after engineering are dropped. The input DataFrame is
    not mutated.
    """
    df = df.copy()
    # Interaction and simple polynomial features
    df['Income_CCAvg'] = df['Income'] * df['CCAvg']  # Interaction term
    df['Age_Square'] = df['Age'] ** 2  # Polynomial feature
    df['Experience_Bin'] = pd.cut(df['Experience'], bins=5, labels=False)  # Binning
    # FIX: PolynomialFeatures(degree=2, include_bias=False) also emits the
    # degree-1 terms 'Age', 'Income', 'CCAvg', which duplicated the existing
    # columns after pd.concat and produced duplicate column names downstream.
    # Compute only the degree-2 terms directly, with the same names sklearn's
    # get_feature_names_out would produce.
    base_cols = ['Age', 'Income', 'CCAvg']
    base = df[base_cols].astype(float)
    poly_terms = {}
    for i, a in enumerate(base_cols):
        for b in base_cols[i:]:
            name = f'{a}^2' if a == b else f'{a} {b}'
            poly_terms[name] = base[a] * base[b]
    # Concatenate original df with the degree-2 polynomial features
    df = pd.concat([df, pd.DataFrame(poly_terms, index=df.index)], axis=1)
    # Ensure no NaNs in the feature-engineered DataFrame
    df = df.dropna()
    return df
# Apply feature engineering to the copied DataFrame
df_engineered_approach3 = feature_engineering(df_original_cleaned_copy_approach3)
# Define features and target
X_approach3 = df_engineered_approach3.drop(columns=['Personal_Loan'])
y_approach3 = df_engineered_approach3['Personal_Loan']
# Check for NaNs in target variable. Defensive only: feature_engineering
# already dropna()s the whole frame, so this branch is normally a no-op.
if y_approach3.isna().any():
    print("Target variable contains NaNs. Handling NaNs...")
    y_approach3 = y_approach3.dropna()  # Drop NaNs in target variable
    X_approach3 = X_approach3.loc[y_approach3.index]  # Re-align features with target
# Split Data 70/30 with a fixed seed for reproducibility.
# NOTE(review): no stratify= — for the imbalanced Personal_Loan target,
# stratify=y_approach3 would keep class ratios consistent; confirm intent.
X_train_approach3, X_test_approach3, y_train_approach3, y_test_approach3 = train_test_split(X_approach3, y_approach3, test_size=0.3, random_state=42)
# Standardize Features: fit the scaler on the training split only, then
# apply the same transform to the test split (avoids data leakage).
scaler_approach3 = StandardScaler()
X_train_scaled_approach3 = scaler_approach3.fit_transform(X_train_approach3)
X_test_scaled_approach3 = scaler_approach3.transform(X_test_approach3)
# Define Models for Approach 3.
# NOTE: 'Decision Tree (No Weights)' and 'Decision Tree (Post-Pruning)' start
# from identical default trees — any pruning difference can only come from
# their hyperparameter grids.
models_approach3 = {
    'Decision Tree (No Weights)': DecisionTreeClassifier(random_state=42),
    'Decision Tree (Class Weight)': DecisionTreeClassifier(class_weight='balanced', random_state=42),
    'Decision Tree (Pre-Pruning)': DecisionTreeClassifier(max_depth=5, random_state=42),
    'Decision Tree (Post-Pruning)': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    # NOTE(review): use_label_encoder was removed in xgboost >= 2.0 and will
    # raise/warn there — confirm the installed xgboost version accepts it.
    'XGBClassifier': XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'SVM': SVC(random_state=42, probability=True),  # probability=True enables predict_proba (slower fit)
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}
# Hyperparameter Tuning for Approach 3.
# Search grids for GridSearchCV, keyed by the model names in models_approach3.
param_grids_approach3 = {
    'Decision Tree (No Weights)': {'max_depth': [5, 10, 15]},
    'Decision Tree (Class Weight)': {'max_depth': [5, 10, 15]},
    'Decision Tree (Pre-Pruning)': {'max_depth': [5, 10, 15]},
    # FIX: "post-pruning" means tuning the cost-complexity parameter
    # ccp_alpha; previously this grid was identical to 'No Weights', so no
    # post-pruning actually happened. 0.0 (the sklearn default, i.e. no
    # pruning) is kept in the grid for backward compatibility.
    'Decision Tree (Post-Pruning)': {'max_depth': [5, 10, 15], 'ccp_alpha': [0.0, 0.001, 0.01]},
    'Random Forest': {'n_estimators': [50, 100, 200], 'max_depth': [5, 10, 15]},
    'XGBClassifier': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]},
    'Gradient Boosting': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]},
    'Logistic Regression': {'C': [0.001, 0.01, 0.1, 1, 10, 100]},  # C = inverse regularization strength
    'SVM': {'C': [0.1, 1, 10, 100], 'kernel': ['linear', 'rbf'], 'gamma': ['scale', 'auto']}
}
results_approach3 = []
# Train and evaluate models for Approach 3: tune each model with 5-fold
# GridSearchCV optimizing F1 (a sensible choice for the imbalanced
# Personal_Loan target), then evaluate the best estimator on the held-out
# test split.
for model_name, model in models_approach3.items():
    print(f"\nTraining and tuning {model_name}...")
    # Hyperparameter tuning
    grid_search_approach3 = GridSearchCV(model, param_grids_approach3[model_name], cv=5, scoring='f1')
    grid_search_approach3.fit(X_train_scaled_approach3, y_train_approach3)
    # Best model (GridSearchCV refits it on the full training split)
    best_model_approach3 = grid_search_approach3.best_estimator_
    # Model Evaluation on the held-out test set
    y_pred_approach3 = best_model_approach3.predict(X_test_scaled_approach3)
    accuracy_approach3 = accuracy_score(y_test_approach3, y_pred_approach3)
    recall_approach3 = recall_score(y_test_approach3, y_pred_approach3)
    precision_approach3 = precision_score(y_test_approach3, y_pred_approach3)
    f1_approach3 = f1_score(y_test_approach3, y_pred_approach3)
    # Cross-Validation F1-Score on the training split (refits the tuned
    # estimator 5 more times)
    cross_val_f1_approach3 = cross_val_score(best_model_approach3, X_train_scaled_approach3, y_train_approach3, cv=5, scoring='f1').mean()
    # Metrics are stored as raw floats here (unlike the string-formatted
    # table earlier), so the later sort_values on 'F1-Score' is numeric.
    results_approach3.append({
        'Model': model_name,
        'Accuracy': accuracy_approach3,
        'Recall': recall_approach3,
        'Precision': precision_approach3,
        'F1-Score': f1_approach3,
        'Cross-Val F1-Score': cross_val_f1_approach3
    })
# Tabulate Approach 3 results, best F1-Score first.
results_df_approach3 = (
    pd.DataFrame(results_approach3)
    .sort_values('F1-Score', ascending=False)
)
# Render without the index for a cleaner report.
print("\nModel Evaluation Results for Approach 3:")
print(results_df_approach3.to_string(index=False))
# Feature Importance Analysis for best models for Approach 3.
# NOTE(review): this loop re-runs the full GridSearchCV for every model,
# duplicating the tuning already done in the evaluation loop above — consider
# caching the best estimators there instead of refitting here.
print("\nFeature Importance for Approach 3:")
for model_name, model in models_approach3.items():
    grid_search_approach3 = GridSearchCV(model, param_grids_approach3[model_name], cv=5, scoring='f1')
    best_model_approach3 = grid_search_approach3.fit(X_train_scaled_approach3, y_train_approach3).best_estimator_
    # Only tree/ensemble models expose feature_importances_; others are skipped
    if hasattr(best_model_approach3, 'feature_importances_'):
        # X_approach3.columns matches the scaled array's column order, since
        # StandardScaler preserves feature order.
        feature_importance_approach3 = pd.DataFrame({
            'Feature': X_approach3.columns,
            'Importance': best_model_approach3.feature_importances_
        }).sort_values(by='Importance', ascending=False)
        print(f"\n{model_name} Feature Importance for Approach 3:")
        print(feature_importance_approach3.head())
        # Plot Feature Importance for Approach 3
        plt.figure(figsize=(10, 6))
        plt.barh(feature_importance_approach3['Feature'], feature_importance_approach3['Importance'])
        plt.xlabel('Importance')
        plt.ylabel('Feature')
        plt.title(f'{model_name} Feature Importance for Approach 3')
        plt.show()
    else:
        print(f"\n{model_name} does not support feature importance.")
# Model Interpretability for Approach 3.
# NOTE: this redefines the identically-named helper from the earlier section
# (harmless — the definitions have the same contract).
def analyze_shap(model, X_train, X_test, feature_names):
    """
    Render a SHAP summary plot for a trained tree-based model.

    Parameters:
    - model: Trained model object (RandomForest, GradientBoosting, or XGB).
    - X_train: Training feature data, used as the explainer background.
    - X_test: Test feature data to explain.
    - feature_names: List of feature names for plot labelling.

    Models outside the supported set are skipped with a message.
    """
    # Only tree ensembles are routed to shap.Explainer here; other model
    # types would need e.g. a KernelExplainer.
    if not isinstance(model, (RandomForestClassifier, GradientBoostingClassifier, XGBClassifier)):
        print(f"SHAP analysis is not supported for {model.__class__.__name__}")
        return
    try:
        # Initialize the SHAP Explainer with the training data as background
        explainer = shap.Explainer(model, X_train)
        # Disable additivity check: it can spuriously fail on transformed inputs
        shap_values = explainer(X_test, check_additivity=False)
        # Create summary plot
        shap.summary_plot(shap_values, X_test, feature_names=feature_names)
    except Exception as e:
        # FIX: the previous `except ImportError` was unreachable (shap is
        # already imported by the time shap.Explainer is referenced), and
        # failures other than TypeError (e.g. sklearn's NotFittedError when
        # an unfitted estimator is passed) would crash the whole script.
        # Report and continue so the remaining models still get analyzed.
        print(f"Error in SHAP interpretation: {e}")
# Analyze SHAP values for the models in Approach 3.
print("\nSHAP Analysis for Approach 3:")
for name, model in models_approach3.items():
    print(f"\nAnalyzing SHAP values for {name} in Approach 3...")
    # FIX: the entries in models_approach3 were never fitted directly —
    # GridSearchCV fits clones, not the originals — so shap.Explainer would
    # fail on unfitted estimators. Fit the SHAP-supported models on the
    # training split first (default hyperparameters; the tuned estimators
    # from GridSearchCV are not retained at this point).
    if isinstance(model, (RandomForestClassifier, GradientBoostingClassifier, XGBClassifier)):
        model.fit(X_train_scaled_approach3, y_train_approach3)
    analyze_shap(model, X_train_scaled_approach3, X_test_scaled_approach3, X_approach3.columns)